library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(mvtnorm)
library(skimr)
##
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
##
## filter
library(ggthemes)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the pre-split training and test sets from the project repository.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 read.csv() keeps text
# columns as character by default, but later steps need factor columns
# (pairs() is coloured by Subscription, and confusionMatrix() is given
# banktest$Subscription as the reference).
# header = TRUE and sep = "," are already the read.csv() defaults.
banktrain <- read.csv(
  "https://raw.githubusercontent.com/JaclynCoate/6372_Project_2/master/Data/Training_Test_Splits/banktrain_raw.csv",
  strip.white = TRUE,
  stringsAsFactors = TRUE
)
banktest <- read.csv(
  "https://raw.githubusercontent.com/JaclynCoate/6372_Project_2/master/Data/Training_Test_Splits/banktest_raw.csv",
  strip.white = TRUE,
  stringsAsFactors = TRUE
)
# Per-column summary statistics for the training data frame
skim(banktrain)
## Skim summary statistics
## n obs: 4718
## n variables: 22
##
## ── Variable type:factor ────────────────────────────────────────────────────────
## variable missing complete n n_unique
## contact 0 4718 4718 2
## day_of_week 0 4718 4718 5
## default 0 4718 4718 2
## education 0 4718 4718 8
## housing 0 4718 4718 3
## job 0 4718 4718 12
## loan 0 4718 4718 3
## marital 0 4718 4718 4
## month 0 4718 4718 10
## poutcome 0 4718 4718 3
## Subscription 0 4718 4718 2
## top_counts ordered
## cel: 3391, tel: 1327, NA: 0 FALSE
## thu: 1040, wed: 953, mon: 941, tue: 922 FALSE
## no: 3962, unk: 756, NA: 0 FALSE
## uni: 1544, hig: 1079, bas: 606, pro: 569 FALSE
## yes: 2481, no: 2129, unk: 108, NA: 0 FALSE
## adm: 1311, blu: 893, tec: 698, ser: 406 FALSE
## no: 3925, yes: 685, unk: 108, NA: 0 FALSE
## mar: 2710, sin: 1485, div: 506, unk: 17 FALSE
## may: 1282, jul: 783, aug: 699, jun: 602 FALSE
## non: 3707, fai: 523, suc: 488, NA: 0 FALSE
## no: 2362, yes: 2356, NA: 0 FALSE
##
## ── Variable type:integer ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## age 0 4718 4718 40.34 11.85 17 31 38
## campaign 0 4718 4718 2.34 2.36 1 1 2
## duration 0 4718 4718 385.22 356.47 4 143 266
## pdays 0 4718 4718 886.8 314.44 0 999 999
## previous 0 4718 4718 0.31 0.69 0 0 0
## row 0 4718 4718 24821.99 12454.11 7 14311 27457.5
## p75 p100 hist
## 48 98 ▂▇▅▃▁▁▁▁
## 3 43 ▇▁▁▁▁▁▁▁
## 517 3631 ▇▂▁▁▁▁▁▁
## 999 999 ▁▁▁▁▁▁▁▇
## 0 6 ▇▂▁▁▁▁▁▁
## 36756.25 41187 ▂▂▃▃▂▃▃▇
##
## ── Variable type:numeric ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## cons_conf_idx 0 4718 4718 -40.18 5.33 -50.8 -42.7
## cons_price_idx 0 4718 4718 93.48 0.64 92.2 92.89
## emp_var_rate 0 4718 4718 -0.49 1.73 -3.4 -1.8
## euribor3m 0 4718 4718 2.97 1.89 0.63 1.24
## nr_employed 0 4718 4718 5136.19 86.48 4963.6 5076.2
## p50 p75 p100 hist
## -41.8 -36.4 -26.9 ▁▆▆▆▇▂▂▁
## 93.44 93.99 94.77 ▂▁▇▅▁▇▁▂
## -0.1 1.4 1.4 ▃▁▆▁▁▁▁▇
## 4.02 4.96 5.04 ▅▅▁▁▁▁▁▇
## 5191 5228.1 5228.1 ▂▂▁▂▅▁▃▇
# Number of rows in the training data frame
nrow(banktrain)
## [1] 4718
# Per-column summary statistics for the test data frame
skim(banktest)
## Skim summary statistics
## n obs: 36470
## n variables: 22
##
## ── Variable type:factor ────────────────────────────────────────────────────────
## variable missing complete n n_unique
## contact 0 36470 36470 2
## day_of_week 0 36470 36470 5
## default 0 36470 36470 3
## education 0 36470 36470 8
## housing 0 36470 36470 3
## job 0 36470 36470 12
## loan 0 36470 36470 3
## marital 0 36470 36470 4
## month 0 36470 36470 10
## poutcome 0 36470 36470 3
## Subscription 0 36470 36470 2
## top_counts ordered
## cel: 22753, tel: 13717, NA: 0 FALSE
## thu: 7583, mon: 7573, wed: 7181, tue: 7168 FALSE
## no: 28626, unk: 7841, yes: 3, NA: 0 FALSE
## uni: 10624, hig: 8436, bas: 5439, pro: 4674 FALSE
## yes: 19095, no: 16493, unk: 882, NA: 0 FALSE
## adm: 9111, blu: 8361, tec: 6045, ser: 3563 FALSE
## no: 30025, yes: 5563, unk: 882, NA: 0 FALSE
## mar: 22218, sin: 10083, div: 4106, unk: 63 FALSE
## may: 12487, jul: 6391, aug: 5479, jun: 4716 FALSE
## non: 31856, fai: 3729, suc: 885, NA: 0 FALSE
## no: 34186, yes: 2284, NA: 0 FALSE
##
## ── Variable type:integer ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## age 0 36470 36470 39.98 10.22 17 32 38
## campaign 0 36470 36470 2.6 2.82 1 1 2
## duration 0 36470 36470 241.86 239.02 0 99 172
## pdays 0 36470 36470 972.27 160.72 0 999 999
## previous 0 36470 36470 0.16 0.46 0 0 0
## row 0 36470 36470 20047.6 11704.32 1 9915.25 19923.5
## p75 p100 hist
## 47 98 ▂▇▆▃▁▁▁▁
## 3 56 ▇▁▁▁▁▁▁▁
## 300 4918 ▇▁▁▁▁▁▁▁
## 999 999 ▁▁▁▁▁▁▁▇
## 0 7 ▇▁▁▁▁▁▁▁
## 30058.75 41188 ▇▇▇▇▇▇▇▆
##
## ── Variable type:numeric ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## cons_conf_idx 0 36470 36470 -40.54 4.53 -50.8 -42.7
## cons_price_idx 0 36470 36470 93.59 0.57 92.2 93.08
## emp_var_rate 0 36470 36470 0.16 1.53 -3.4 -1.8
## euribor3m 0 36470 36470 3.71 1.7 0.63 1.41
## nr_employed 0 36470 36470 5171.03 69.2 4963.6 5099.1
## p50 p75 p100 hist
## -41.8 -36.4 -26.9 ▁▅▆▃▇▁▁▁
## 93.92 93.99 94.77 ▁▁▅▅▁▇▁▂
## 1.1 1.4 1.4 ▁▁▃▁▁▁▁▇
## 4.86 4.96 5.04 ▁▃▁▁▁▁▁▇
## 5191 5228.1 5228.1 ▁▁▁▁▃▁▃▇
# Number of rows in the test data frame
nrow(banktest)
## [1] 36470
Upon completion of our first EDA, we resume the EDA in order to build an LDA model
- LDA does not handle categorical variables as predictors. Therefore we will remove the categorical variables from our training data set
- This includes the removal of duration, since it is considered a post variable (known only afterwards) and not a pre variable usable for prediction
- We will only include the continuous variables identified in the initial EDA
# Drop every column that is not used as a predictor, in two passes.
# Pass 1: the categorical columns, together with the integer columns age and
# duration.
banktrain2 <- banktrain %>%
  dplyr::select(
    -c(
      "age", "duration", "job", "marital", "education", "default",
      "housing", "loan", "contact", "month", "day_of_week", "poutcome"
    )
  )
# Pass 2: the remaining numeric columns that are not kept, including the row
# column.
banktrain3 <- banktrain2 %>%
  dplyr::select(
    -c("pdays", "nr_employed", "emp_var_rate", "row", "previous", "campaign")
  )
# Summarise what is left: Subscription plus cons_conf_idx, cons_price_idx and
# euribor3m.
skim(banktrain3)
## Skim summary statistics
## n obs: 4718
## n variables: 4
##
## ── Variable type:factor ────────────────────────────────────────────────────────
## variable missing complete n n_unique top_counts
## Subscription 0 4718 4718 2 no: 2362, yes: 2356, NA: 0
## ordered
## FALSE
##
## ── Variable type:numeric ───────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## cons_conf_idx 0 4718 4718 -40.18 5.33 -50.8 -42.7 -41.8
## cons_price_idx 0 4718 4718 93.48 0.64 92.2 92.89 93.44
## euribor3m 0 4718 4718 2.97 1.89 0.63 1.24 4.02
## p75 p100 hist
## -36.4 -26.9 ▁▆▆▆▇▂▂▁
## 93.99 94.77 ▂▁▇▅▁▇▁▂
## 4.96 5.04 ▅▅▁▁▁▁▁▇
# Open banktrain3 in the data viewer for a manual look; invisible() keeps the
# value returned by view() from being printed.
# NOTE(review): view() presumably resolves to tibble::view() (tidyverse is
# attached) - confirm; a viewer call is only useful in an interactive session.
invisible(view(banktrain3))
Pairs Scatter Plot
- Pairs scatter plot to see if there is any separation
- We don’t see a lot so we break up the data set for larger visualizations
# Scatter-plot matrix of all banktrain3 columns, points coloured by the
# Subscription level
pairs(banktrain3, col = banktrain3$Subscription)

LDA
# Fit linear discriminant analysis: Subscription is the response and every
# other column of banktrain3 is a predictor.
mylda <- lda(Subscription ~ ., data = banktrain3)
# predict() returns a list; its "class" element is the predicted response level
# for each row of the test data.
Pred <- predict(mylda, newdata = banktest)[["class"]]
Truth <- banktest$Subscription
# Confusion matrix: predicted level (rows) against observed level (columns)
x <- table(Pred, Truth)
x
## Truth
## Pred no yes
## no 24665 666
## yes 9521 1618
# caret confusion matrix with accuracy, kappa and per-class statistics for the
# LDA predictions. No positive class is requested, and the printed result
# reports 'no' as the positive class, so Sensitivity and Specificity below are
# stated with respect to 'no'.
CMlda <- confusionMatrix(data = Pred, reference = Truth)
CMlda
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 24665 666
## yes 9521 1618
##
## Accuracy : 0.7207
## 95% CI : (0.716, 0.7253)
## No Information Rate : 0.9374
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.153
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7215
## Specificity : 0.7084
## Pos Pred Value : 0.9737
## Neg Pred Value : 0.1453
## Prevalence : 0.9374
## Detection Rate : 0.6763
## Detection Prevalence : 0.6946
## Balanced Accuracy : 0.7150
##
## 'Positive' Class : no
##
QDA
# Fit quadratic discriminant analysis on the same response and predictors as
# used for the LDA fit (all columns of banktrain3).
myqda <- qda(Subscription ~ ., data = banktrain3)
# Take the "class" element of the prediction list: the predicted response level
# for each row of the test data.
PredQ <- predict(myqda, newdata = banktest)[["class"]]
TruthQ <- banktest$Subscription
# Confusion matrix: predicted level (rows) against observed level (columns)
x <- table(PredQ, TruthQ)
x
## TruthQ
## PredQ no yes
## no 31607 1204
## yes 2579 1080
# caret confusion matrix and summary statistics for the QDA predictions. As the
# printed result shows, 'no' is treated as the positive class.
CMqda <- confusionMatrix(data = PredQ, reference = TruthQ)
CMqda
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 31607 1204
## yes 2579 1080
##
## Accuracy : 0.8963
## 95% CI : (0.8931, 0.8994)
## No Information Rate : 0.9374
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3103
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9246
## Specificity : 0.4729
## Pos Pred Value : 0.9633
## Neg Pred Value : 0.2952
## Prevalence : 0.9374
## Detection Rate : 0.8667
## Detection Prevalence : 0.8997
## Balanced Accuracy : 0.6987
##
## 'Positive' Class : no
##
3D Graph of Continuous Variables (predictors) colored by Subscription (dependent variable)
# Interactive 3D scatter of the three continuous predictors from the training
# data, one colour per Subscription level.
fig <- plot_ly(
  banktrain,
  x = ~cons_price_idx,
  y = ~cons_conf_idx,
  z = ~euribor3m,
  color = ~Subscription,
  colors = c("#BF382A", "#228B22")
)
# Draw the observations as markers
fig2 <- add_markers(fig)
# Label the three axes of the 3D scene
fig3 <- layout(
  fig2,
  scene = list(
    xaxis = list(title = "Consumer Price Indx"),
    yaxis = list(title = "Consumer Confidence Indx"),
    zaxis = list(title = "3 Month Rate")
  )
)
fig3